package edu.hit.crawler.util;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Reads the robots directives of an HTML document from its
 * {@code <meta name="robots" content="...">} tags.
 *
 * <p>Every {@code <meta>} tag in the document is inspected; the directives of
 * all tags whose {@code name} attribute equals {@code robots} (ignoring case)
 * are combined. Attribute order inside the tag does not matter, and attribute
 * values may be double-quoted, single-quoted or unquoted.
 *
 * <p>Recognised directives (comma and/or whitespace separated, case
 * insensitive):
 * <ul>
 *   <li>{@code nofollow} - links must not be followed</li>
 *   <li>{@code noindex} - the page must not be indexed</li>
 *   <li>{@code none} - shorthand for {@code noindex, nofollow}</li>
 * </ul>
 * Any other directive is ignored. Without a robots meta tag both flags keep
 * their default value {@code true}.
 */
public class MetaParser {
    /**
     * Matches one {@code <meta ...>} start tag up to the first {@code >}.
     * A {@code >} inside a quoted attribute value truncates the tag; such a
     * tag then simply yields no usable attribute and is skipped.
     */
    private static final Pattern META_TAG = Pattern.compile(
            "<meta\\b[^>]*>", Pattern.CASE_INSENSITIVE);

    /**
     * Matches one {@code name=value} attribute. Group 1 is the attribute
     * name; exactly one of groups 2 (double-quoted), 3 (single-quoted) or
     * 4 (unquoted) holds the value.
     */
    private static final Pattern ATTRIBUTE = Pattern.compile(
            "([^\\s=/>\"']+)\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"'>]+))");

    /** Separator between directives inside the {@code content} value. */
    private static final Pattern DIRECTIVE_SEPARATOR = Pattern.compile("[,\\s]+");

    private final boolean followed;
    private final boolean indexed;

    /**
     * Parses the robots meta tags of the given document.
     *
     * @param html the HTML source to inspect; {@code null} is treated like a
     *             document without any robots meta tag
     */
    public MetaParser(String html) {
        boolean follow = true;
        boolean index = true;

        if (html != null) {
            Matcher tags = META_TAG.matcher(html);
            while (tags.find()) {
                String tag = tags.group();
                String name = attributeValue(tag, "name");
                if (name == null || !"robots".equalsIgnoreCase(name.trim())) {
                    continue; // some other meta tag (description, keywords, ...)
                }
                String content = attributeValue(tag, "content");
                if (content == null) {
                    continue;
                }
                // Compare whole tokens, not substrings, so that values such as
                // "max-image-preview:none" are not mistaken for "none".
                // equalsIgnoreCase is locale independent, unlike toLowerCase().
                for (String directive : DIRECTIVE_SEPARATOR.split(content)) {
                    if ("nofollow".equalsIgnoreCase(directive)) {
                        follow = false;
                    } else if ("noindex".equalsIgnoreCase(directive)) {
                        index = false;
                    } else if ("none".equalsIgnoreCase(directive)) {
                        follow = false;
                        index = false;
                    }
                }
            }
        }

        this.followed = follow;
        this.indexed = index;
    }

    /**
     * Returns the value of the first attribute called {@code attribute}
     * (ignoring case) inside {@code tag}, or {@code null} if there is none.
     */
    private static String attributeValue(String tag, String attribute) {
        Matcher attrs = ATTRIBUTE.matcher(tag);
        while (attrs.find()) {
            if (!attribute.equalsIgnoreCase(attrs.group(1))) {
                continue;
            }
            for (int group = 2; group <= 4; group++) {
                String value = attrs.group(group);
                if (value != null) {
                    return value;
                }
            }
        }
        return null;
    }

    /**
     * @return {@code false} if a robots meta tag contained {@code nofollow}
     *         or {@code none}, {@code true} otherwise
     */
    public boolean isFollowed() {
        return followed;
    }

    /**
     * @return {@code false} if a robots meta tag contained {@code noindex}
     *         or {@code none}, {@code true} otherwise
     */
    public boolean isIndexed() {
        return indexed;
    }
}
